<!--begin.rcode custom_load, results='hide', echo=FALSE, message=FALSE
library(caret)
library(kernlab)
library(class)
library(caTools)
library(pROC)
library(DMwR)
library(randomForest)
library(reshape2)
library(plyr)
library(pls)
library(parallel)
library(doMC)
registerDoMC(cores=detectCores()-1)

theme1 <- caretTheme()
theme1$superpose.symbol$col = c(rgb(1, 0, 0, .4), rgb(0, 0, 1, .4), 
                                rgb(0.3984375, 0.7578125,0.6445312, .6))
theme1$superpose.symbol$pch = c(15, 16, 17)
theme1$superpose.cex = .8
theme1$superpose.line$col = c(rgb(1, 0, 0, .9), rgb(0, 0, 1, .9), rgb(0.3984375, 0.7578125,0.6445312, .6))
theme1$superpose.line$lwd <- 2
theme1$superpose.line$lty = 1:3
theme1$plot.symbol$col = c(rgb(.2, .2, .2, .4))
theme1$plot.symbol$pch = 16
theme1$plot.cex = .8
theme1$plot.line$col = c(rgb(1, 0, 0, .7))
theme1$plot.line$lwd <- 2
theme1$plot.line$lty = 1

i <- 1

hook_inline = knit_hooks$get('inline')
knit_hooks$set(inline = function(x) {
  if (is.character(x)) highr::hi_html(x) else hook_inline(x)
  })
opts_chunk$set(comment=NA, digits = 4,warning=FALSE,message=FALSE)

library(ellipse)
library(mlbench)
data(Sonar)
session <- paste(format(Sys.time(), "%a %b %d %Y"),
                 "using caret version",
                 packageDescription("caret")$Version,
                 "and",
                 R.Version()$version.string)
#    options(digits=2)
    end.rcode-->

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  <!--
  Design by Free CSS Templates
http://www.freecsstemplates.org
Released for free under a Creative Commons Attribution 2.5 License

Name       : Emerald 
Description: A two-column, fixed-width design with dark color scheme.
Version    : 1.0
Released   : 20120902

-->
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head>
  <meta name="keywords" content="" />
  <meta name="description" content="" />
  <meta http-equiv="content-type" content="text/html; charset=utf-8" />
  <title>Using Your Own Model in train</title>
  <link href='http://fonts.googleapis.com/css?family=Abel' rel='stylesheet' type='text/css'>
  <link href="style.css" rel="stylesheet" type="text/css" media="screen" />
  </head>
  <body>
  <div id="wrapper">
  <div id="header-wrapper" class="container">
  <div id="header" class="container">
  <div id="logo">
  <h1><a href="#">Using Your Own Model in train</a></h1>
</div>
  <!--
  <div id="menu">
  <ul>
  <li class="current_page_item"><a href="#">Homepage</a></li>
<li><a href="#">Blog</a></li>
<li><a href="#">Photos</a></li>
<li><a href="#">About</a></li>
<li><a href="#">Contact</a></li>
</ul>
  </div>
  -->
  </div>
  <div><img src="images/img03.png" width="1000" height="40" alt="" /></div>
  </div>
  <!-- end #header -->
<div id="page">
  <div id="content">

<h1>Contents</h1>  
<ul>
  <li>
  <a href="#Introduction">Introduction</a>
  </li>
  <li>
  <a href="#Illustration1">Illustrative Example 1: SVMs with Laplacian Kernels</a>
  </li>
  <li>
  <a href="#Components">Model Components</a>
  </li>
  <li>
  <a href="#Illustration2">Illustrative Example 2: Something More Complicated - LogitBoost</a>
  </li>
  <li>
  <a href="#Illustration3">Illustrative Example 3: Nonstandard Formulas</a>  </li>
  </li>
  <li>
  <a href="#Illustration4">Illustrative Example 4: PLS Feature Extraction Pre-Processing</a>      
  </li> 
  <li>
  <a href="#Illustration5">Illustrative Example 5: Optimizing probability thresholds for class imbalances </a>      
  </li>   
 </ul>   
  
<div id="Introduction"></div>
<h1>Introduction</h1>  
<p>
The package contains a large number of predictive model interfaces. However, you may want to create your own because:
</p>
<ul>
  <li> you are testing out a novel model or the package doesn't have a model that you are interested in
  </li>
  <li> you would like to run an existing model in the package your own way
  </li>
  <li> there are pre-processing or sampling steps not contained in the package or
you just don't like the way the package does things
  </li>
</ul>
<p>
You can still get the benefits of the <a href="http://cran.r-project.org/web/packages/caret/index.html"><strong>caret</strong></a>  infrastructure by creating your own model.
</p>
<p>Currently, when you specify the type of model that you are interested in (e.g. <tt><!--rinline 'type = "lda"'  --></tt>), the  <span class="mx funCall">train</span> function runs another function called <span class="mx funCall">getModelInfo</span> to retrieve the specifics of that model from the existing catalog. For example:
</p>
<!--begin.rcode custom_modelInfo 
ldaModelInfo <- getModelInfo(model = "lda", regex = FALSE)[[1]]
## Model components
names(ldaModelInfo)
    end.rcode--> 
<p>
To use your own model, you can pass a list of these components to <tt><!--rinline 'type'  --></tt>. This page will describe those components in detail.
</p>

<div id="Illustration1"></div>
<h1>Illustrative Example 1: SVMs with Laplacian Kernels</h1>
<p>
The package currently contains support vector machine (SVM) models using linear, polynomial and radial basis function kernels. The <a href="http://cran.r-project.org/web/packages/kernlab/index.html"><strong>kernlab</strong></a>  package has other functions, including the Laplacian kernel. We will illustrate the model components for this model, which has two parameters: the standard cost parameter for SVMs and one kernel parameter (<tt>sigma</tt>)
</p>


<div id="Components"></div>
<h1>Model Components</h1>


<p> 
You can pass a list of information to the <span class="mx arg">method</span> argument in <span class="mx funCall">train</span>. For models that are built-in to the package, you can just pass the method name as before. 
</p>
<p> 
There are some basic components of the list for custom models. A brief description is below for each then, after setting up and example, each will be described in detail. The list should have the following elements:
</p>
<ul>
  <li>
  <span class="mx arg">library</span> is a character vector of package names that will be needed to fit the model or calculate predictions. <tt><!--rinline 'NULL' --></tt> can also be used.
  </li>
  <li>
  <span class="mx arg">type</span> is a simple character vector with values <tt><!--rinline '"Classification"'  --></tt>, <tt><!--rinline '"Regression"'  --></tt> or both. 
  </li>
  <li>
  <span class="mx arg">parameters</span> is a data frame with three simple attributes for each tuning parameter (if any): the argument name (e.g. <span class="mx arg">mtry</span>), the type of data in the parameter grid and textual labels for the parameter.
  </li>
  <li>
  <span class="mx arg">grid</span> is a function that is used to create the tuning grid (unless the user gives the exact values of the parameters via <span class="mx funCall">tuneGrid</span>)
  </li>
  <li>
  <span class="mx arg">fit</span> is a function that fits the model
  </li>
  <li>
  <span class="mx arg">predict</span> is the function that creates predictions
  </li>
  <li>
  <span class="mx arg">prob</span> is a function that can be used to create class probabilities (if applicable)
  </li>
  <li>
  <span class="mx arg">sort</span> is a function that sorts the parameter from most complex to least
  </li>
  <li>
  <span class="mx arg">loop</span> is an <b>optional</b> function for advanced users for models that can create multiple submodel predictions from the same object.
  </li>
  <li>
  <span class="mx arg">levels</span> is an <b>optional</b> function, primarily for classification models using <tt>S4</tt> methods to return the factor levels of the outcome.
  </li>  
  <li>
  <span class="mx arg">tags</span> is an <b>optional</b> character vector that has subjects associated with the model, such as <tt><!--rinline 'Tree-Based Model' --></tt> or <tt><!--rinline 'Embedded Feature Selection' --></tt>. This string is used by the package to create additional documentation pages on the package website. 
  </li>  
  <li>
  <span class="mx arg">label</span> is an <b>optional</b> character string that names the model (e.g. "Linear Discriminant Analysis").
  </li>   
  <li>
  <span class="mx arg">predictors</span> is an <b>optional</b> function that returns a character vector that contains the names of the predictors that we used in the prediction equation.
  </li> 
  <li>
  <span class="mx arg">varImp</span> is an <b>optional</b> function that calculates variable importance metrics for the model (if any).
  </li>    
  <li>
  <span class="mx arg">oob</span> is another <b>optional</b> function that calculates  out-of-bag performance estimates from the model object. Most models do not have this capability but some (e.g. random forests, bagged models) do. 
  </li>    
  <li>
  <span class="mx arg">notes</span> is an <b>optional</b> character vector that can be used to document non-obvious aspects of the model. For example, there are two Bayesian lasso models (<a href="https://github.com/topepo/caret/blob/master/models/files/blasso.R"><tt>blasso</tt></a> and <a href="https://github.com/topepo/caret/blob/master/models/files/blassoAveraged.R"><tt>blassoAveraged</tt></a>) and this field is used to describe the differences between the two models. 
  </li>    
  <li>
  <span class="mx arg">check</span> is an <b>optional</b> function that can be used to check the system/install to make sure that any atypical software requirements are available to the user. The input is <span class="mx arg">pkg</span>, which is the same character string given by the <span class="mx arg">library</span>. This function is run <i>after</i> the checking function to see if the packages specified in <span class="mx arg">library</span> are installed. As an example, the model <a href="https://github.com/topepo/caret/blob/master/models/files/pythonKnnReg.R"><tt>pythonKnnReg</tt></a> uses certain python libraries and the user should have python and these libraries installed. The <a href="https://github.com/topepo/caret/blob/master/models/files/pythonKnnReg.R">model file</a> demonstrates how to check for python libraries prior to running the R model. 
  </li>      
 </ul> 
 
<p>
 In the <a href="http://cran.r-project.org/web/packages/caret/index.html"><strong>caret</strong></a>  package, the subdirectory <tt>models</tt> has all the code for each model that <span class="mx funCall">train</span> interfaces with and these can be used as prototypes for your model. 
</p>
<p>
Let's create a new model for a classification support vector machine using the Laplacian kernel function. We will use the <span class="mx funCall">kernlab</span> package's <span class="mx funCall">ksvm</span> function. The kernel has two parameters: the standard cost parameter for SVMs and one kernel parameter (<tt>sigma</tt>).
</p>
<p>
To start, we'll create a new list:
</p>
<!--begin.rcode custom_basic 
    lpSVM <- list(type = "Classification",
                  library = "kernlab",
                  loop = NULL) 
    end.rcode--> 
<p>
This model can also be used for regression but we will constrain things here for simplicity. For other SVM models, the type value would be <tt><!--rinline 'c("Classification", "Regression")'  --></tt>. 
</p>
<p>
The <span class="mx arg">library</span> value checks to see if this package is installed and loads it whenever it is needed (e.g. before modeling or prediction).
</p>

<h2>The parameters Element</h2>

<p>
We have to create some basic information for the parameters in the form of a data frame. The first column is the name of the parameter. The convention is to use the argument name in the model function (e.g. the <span class="mx funCall">ksvm</span> function here). Those values are <tt>C</tt> and <tt>sigma</tt>. Each is a number and we can give them labels of <tt><span class="string">"Cost"</span></tt> and <tt><span class="string">"Sigma"</span></tt>, respectively. The <span class="mx arg">parameters</span> element would then be:
</p>
<!--begin.rcode custom_labels, tidy=FALSE
prm <- data.frame(parameter = c("C", "sigma"),
                  class = rep("numeric", 2),
                  label = c("Cost", "Sigma"))
    end.rcode--> 
<p>                      
Now we assign it to the model list:
</p>
<!--begin.rcode
    lpSVM$parameters <- prm
    end.rcode--> 
<p>    
Values of <span class="mx arg">type</span> can indicate numeric, character or logical data types. 
</p>

<h2>The grid Element</h2>
<p> 
This should be a function that takes parameters: <span class="mx arg">x</span> and <span class="mx arg">y</span> (for the predictors and outcome data), <span class="mx arg">len</span> (the number of values per tuning parameter)  as well as <span class="mx arg">search</span>. <span class="mx arg">len</span> is the value of <span class="mx arg">tuneLength</span> that is potentially passed in through <span class="mx funCall">train</span>. <span class="mx arg">search</span> can be either <tt><span class="hl str">&quot;grid&quot;</span></tt> or <tt><span class="hl str">&quot;random&quot;</span></tt>. This can be used to setup a grid for searching or random values for random search. 
</p> 
<p> 
The output should be a data frame of tuning parameter combinations with a column for each parameter. The column names should be the parameter name (e.g. the values of <tt>prm$parameter</tt>). In our case, let's vary the cost parameter on the log 2 scale. For the sigma parameter, we can use the <tt>kernlab</tt> function <span class="mx funCall">sigest</span> to pre-estimate the value. Following <span class="mx funCall">ksvm</span> we take the average of the low and high estimates. Here is a function we could use:
</p> 
<!--begin.rcode custom_grid, tidy=FALSE
svmGrid <- function(x, y, len = NULL, search = "grid") {
  library(kernlab)
  ## This produces low, middle and high values for sigma 
  ## (i.e. a vector with 3 elements). 
  sigmas <- sigest(as.matrix(x), na.action = na.omit, scaled = TRUE)  
  ## To use grid search:
  if(search == "grid") {
    out <- expand.grid(sigma = mean(as.vector(sigmas[-2])),
                       C = 2 ^((1:len) - 3))
  } else {
    ## For random search, define ranges for the parameters then
    ## generate random values for them
    rng <- extendrange(log(sigmas), f = .75)
    out <- data.frame(sigma = exp(runif(len, min = rng[1], max = rng[2])),
                      C = 2^runif(len, min = -5, max = 8))
  }
  out
}
   end.rcode--> 
<p> 
Again, the user can pass their own grid via <span class="mx funCall">train</span>'s <span class="mx arg">tuneGrid</span> option or they can use this code to create a default grid. 
</p> 
<p> 
We assign this function to the overall model list:
</p> 
<!--begin.rcode custom_grid2 
    lpSVM$grid <- svmGrid
    end.rcode--> 

<h2>The fit Element</h2>

<p>
Here is where we fit the model. This <span class="mx funCall">fit</span> function has several arguments:
<ul>
  <li>
  <span class="mx arg">x</span>, <span class="mx arg">y</span>: the current data used to fit the model
  </li>
  <li>
  <span class="mx arg">wts</span>: optional instance weights (not applicable for this particular model)
  </li>
  <li>
  <span class="mx arg">param</span>: the current tuning parameter values
  </li>
  <li>
  <span class="mx arg">lev</span>: the class levels of the outcome (or <tt><!--rinline 'NULL' --></tt> in regression)
  </li>
  <li>
  <span class="mx arg">last</span>: a logical for whether the current fit is the final fit
  </li>
  <li>
  <span class="mx arg">weights</span>
  </li>
  <li>
  <span class="mx arg">classProbs</span>: a logical for whether class probabilities should be computed.
  </li>
</ul>
<p>
Here is something we could use for this model:
<p>
<!--begin.rcode custom_fit, tidy=FALSE
svmFit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { 
  ksvm(x = as.matrix(x), y = y,
       kernel = rbfdot,
       kpar = list(sigma = param$sigma),
       C = param$C,
       prob.model = classProbs,
       ...)
 }
 
lpSVM$fit <- svmFit
    end.rcode--> 
<p>
A few notes about this:
</p>
<ul>
  <li>
  Notice that the package is not loaded in the code. It is loaded prior to this function being called so it won't hurt if you load it again (but that's not needed).
  </li> 
  <li>
  The <span class="mx funCall">ksvm</span> function requires a <i>matrix</i> or predictors. If the original data were a data frame, this would throw and error. 
  </li> 
  <li>
  The tuning parameters are references in the <span class="mx arg">param</span> data frame. There is always a single row in this data frame. 
  </li> 
  <li>
  The probability model is fit based on the value of <span class="mx arg">classProbs</span>. This value is determined by the value given in <span class="mx funCall">trainControl</span>. 
  </li> 
  <li>
  The three dots allow the user to pass options in from <span class="mx funCall">train</span> to, in this case, the <span class="mx funCall">ksvm</span> function. For example, if the user wanted to set the cache size for the function, they could list <span class="mx arg">cache</span><tt> = <!--rinline '80' --></tt> and this argument will be pass from <span class="mx funCall">train</span> to <span class="mx funCall">ksvm</span>.  
  </li> 
  <li> 
  Any pre-processing that was requested in the call to <span class="mx funCall">train</span> have been done. For example, if <span class="mx arg">preProc</span><tt> = <!--rinline '"center"'  --></tt> was originally requested, the columns of <tt>x</tt> seen within this function are mean centered. 
  </li> 
</ul>

<h2>The predict Element</h2>

<p>
This is a function that produces a vector or predictions. In our case, these are class predictions but they could be numbers for regression models. 
</p>
<p>
The arguments are:
</p>
<ul>
  <li>
  <span class="mx arg">modelFit</span>: the model produced by the <tt>fit</tt> code shown above. 
  </li>
  <li>
  <span class="mx arg">newdata</span>: the predictor values of the instances being predicted (e.g. out-of-bag samples)
  </li>
  <li>
  <span class="mx arg">preProc</span> 
  </li>
  <li>
  <span class="mx arg">submodels</span>: this an optional list of tuning parameters only used with the <span class="mx arg">loop</span> element discussed below. In most cases, it will be <tt><!--rinline 'NULL' --></tt>.
  </li>
</ul>
<p>
Our function will be very simple:
</p>
<!--begin.rcode custom_pred, tidy=FALSE
svmPred <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
   predict(modelFit, newdata)
lpSVM$predict <- svmPred
    end.rcode--> 
<p></p>

<p>    
The function <span class="mx funCall">predict.ksvm</span> will automatically create a factor vector as output. The function could also produce character values. Either way, the innards of <span class="mx funCall">train</span> will make them factors and ensure that the same levels as the original data are used. 
</p>

<h2>The prob Element</h2>

<p>
If a regression model is being used or if the classification model does not create class probabilities a value of <tt><!--rinline 'NULL' --></tt> can be used here instead of a function. Otherwise, the function arguments are the same as the <span class="mx arg">pred</span> function. The output should be a matrix or data frame of class probabilities with a column for each class. The column names should be the class levels. 
</p>
<p>
We can use:
</p>

<!--begin.rcode custom_prob, tidy=FALSE
svmProb <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
   predict(modelFit, newdata, type="probabilities")
lpSVM$prob <- svmProb
    end.rcode--> 

<p></p>
<p>
If you look at some of the SVM examples in the <tt>models</tt> directory, the real functions used by <span class="mx funCall">train</span> are much more complicated so that they can deal with model failures, probabilities that do not sum to 1 etc.
</p>

<h2>The sort Element</h2>

<p>
This is an optional function that sorts the tuning parameters from the simplest model to the most complex. There are times where this ordering is not obvious. This information is used when the performance values are tied across multiple parameters. We would probably want to choose the least complex model in those cases. 
</p>

<p>
Here, we will sort by the cost value. Smaller values of <tt>C</tt> produce smoother class boundaries than larger values:
</p>

<!--begin.rcode custom_sort
    svmSort <- function(x) x[order(x$C),]
    lpSVM$sort <- svmSort
    end.rcode--> 


<h2>The levels Element</h2>

<p>
<span class="mx funCall">train</span> ensures that classification models always predict factors with the same levels. To do this at prediction time, the package needs to know the levels from the model object (specifically, the <tt><!--rinline 'finalModels' --></tt> slot of the <span class="mx funCall">train</span> object).
</p>
<p>
For model functions using <code>S3</code> methods, <span class="mx funCall">train</span> automatically attaches a character vector called <tt><!--rinline 'obsLevels' --></tt> to the object and the package code uses this value. However, this strategy does not work for <code>S4</code> methods. In these cases, the package will use the code found in the <tt><!--rinline 'levels' --></tt> slot of the model list. 
</p>
<p>
For example, the <span class="mx funCall">ksvm</span> function uses <code>S4</code> methods but, unlike most model functions, has a built--in function called <span class="mx funCall">lev</span> that will extract the class levels (if any). In this case, our levels code would be:
</p>
<!--begin.rcode custom_lev
    lpSVM$levels <- function(x) lev(x)
    end.rcode--> 
<p>
In most other cases, the levels will beed to be extracted from data contained in the fitted model object. As another example, objects created using the <span class="mx funCall">ctree</span> function in the <span class="mx funCall">party</span> package would need to use:
</p>

<!--begin.rcode custom_lev2, eval=FALSE
    function(x) levels(x@data@get("response")[,1])
    end.rcode--> 
<p>
Again, this slot is only used for classification models using <code>S4</code> methods.
</p>

<p>
We should now be ready to fit our model. 
</p>
<!--begin.rcode custom_sonar, tidy=FALSE,cache=TRUE
library(mlbench)
data(Sonar)
  
library(caret)
set.seed(998)
inTraining <- createDataPartition(Sonar$Class, p = .75, list = FALSE)
training <- Sonar[ inTraining,]
testing  <- Sonar[-inTraining,]

fitControl <- trainControl(method = "repeatedcv",
                           ## 10-fold CV...
                           number = 10,
                           ## repeated ten times
                           repeats = 10)
  
set.seed(825)
Laplacian <- train(Class ~ ., data = training, 
                   method = lpSVM, 
                   preProc = c("center", "scale"),
                   tuneLength = 8,
                   trControl = fitControl)
print(Laplacian, digits = 3)
    end.rcode--> 

<p>
A plot of the data shows that the model doesn't change when the cost value is above 16.
</p>
<!--begin.rcode custom_trainplot, Laplacian,fig.width=7,fig.height=5
trellis.par.set(caretTheme())
plot(Laplacian,  scales = list(x = list(log = 2)))
    end.rcode-->

<div id="Illustration2"></div>
<h1>Illustrative Example 2: Something More Complicated - LogitBoost</h1>

<h2>The loop Element</h2>

<p>
This function can be used to create custom loops for models to tune over. In most cases, the function can just return the existing tuning grid.
</p>
<p>
For example, a <span class="mx funCall">LogitBoost</span> model can be trained over the number of boosting iterations. In the <a href="http://cran.r-project.org/web/packages/caTools/index.html"><strong>caTools</strong></a> package, the <span class="mx funCall">LogitBoost</span> function can be used to fit this model. For example:
</p>
<!--begin.rcode custom_lb, eval=FALSE
mod <- LogitBoost(as.matrix(x), y, nIter = 51)
    end.rcode-->    

<p>
If we were to tune the model evaluating models where the number of iterations was 11, 21, 31, 41 and 51, the grid could be
</p>
<!--begin.rcode custom_lb_grid
lbGrid <- data.frame(nIter = seq(11, 51, by = 10))    
    end.rcode-->    

<p>
During resampling, <span class="mx funCall">train</span> could loop over all five rows in <code>lbGrid</code> and fit five models. However, the <span class="mx funCall">predict.LogitBoost</span> function has an argument called <span class="mx arg">nIter</span> that can produce, in this case, predictions from <code>mod</code> for all five models. 
</p>
<p>
Instead of <span class="mx funCall">train</span> fitting five models, we could fit a single model with <span class="mx arg">nIter</span> = <!--rinline '51' --> and derive predictions for all five models using only <code>mod</code>. 
</p>
<p>
The terminology used here is that  <span class="mx arg">nIter</span> is a <i>sequential</i> tuning parameter (and the other parameters would be considered <i>fixed</i>). 
</p>
<p>
The <span class="mx arg">loop</span> argument for models is used to produce two objects:
</p>
<ul>
 <li> <code>loop</code>: this is the actual loop that is used by `train`. 
 </li>
 <li>
 <code>submodels</code> is a <i>list</i> that has as many elements as there are rows in <code>loop</code>. The list has all the "extra" parameter settings that can be derived for each model.
 </li>
</ul>
<p>
Going back to the <span class="mx funCall">LogitBoost</span> example, we could have:
</p>
<!--begin.rcode custom_lb_loops
    loop <- data.frame(.nIter = 51)
loop
    submodels <- list(data.frame(nIter = seq(11, 41, by = 10)))   
submodels
    end.rcode-->
<p>
For this case, <span class="mx funCall">train</span> first fits the <span class="mx arg">nIter</span><tt> = <!--rinline '51' --></tt> model. When the model is predicted, that code has a <span class="mx funCall">for</span> loop that iterates over the elements of <code>submodel[[1]]</code> to get the predictions for the other 4 models. 
</p>
<p>
In the end, predictions for all five models (for <span class="mx arg">nIter</span><tt> = <!--rinline 'seq(11, 51, by = 10)' --></tt>) with a single model fit. 
</p>
<p>
There are other models built-in to <a href="http://cran.r-project.org/web/packages/caret/index.html"><strong>caret</strong></a> that are used this way. There are a number of models that have multiple sequential tuning parameters.
</p>
<p>
If the <span class="mx arg">loop</span> argument is left <code><!--rinline 'NULL' --></code> the results of <span class="mx arg">tuneGrid</span> are used as the simple loop and is recommended for most situations. Note that the machinery that is used to "derive" the extra predictions is up to the user to create, typically in the <span class="mx arg">predict</span> and <span class="mx arg">prob</span>  elements of the custom model object. 
</p>
<p>    
For the <span class="mx funCall">LogitBoost</span> model, some simple code to create these objects would be:
</p>

<!--begin.rcode custom_lb_loop_funcs
    fullGrid <- data.frame(nIter = seq(11, 51, by = 10))

    ## Get the largest value of nIter to fit the "full" model
    loop <- fullGrid[which.max(fullGrid$nIter),,drop = FALSE]
    loop

    submodels <- fullGrid[-which.max(fullGrid$nIter),,drop = FALSE]

    ## This needs to be encased in a list in case there are more
    ## than one tuning parameter
    submodels <- list(submodels)  
    submodels
    end.rcode-->   
<p>
For the <span class="mx funCall">LogitBoost</span> custom model object, we could use this code in the <span class="mx arg">predict</span> slot:
<p>
<!--begin.rcode custom_lb_pred, tidy=FALSE, cache=TRUE
lbPred <- function(modelFit, newdata, preProc = NULL, submodels = NULL) {
  ## This model was fit with the maximum value of nIter
  out <- caTools::predict.LogitBoost(modelFit, newdata, type="class")
  
  ## In this case, 'submodels' is a data frame with the other values of
  ## nIter. We loop over these to get the other predictions.
  if(!is.null(submodels)) {
    ## Save _all_ the predictions in a list
    tmp <- out
    out <- vector(mode = "list", length = nrow(submodels) + 1)
    out[[1]] <- tmp
    
    for(j in seq(along = submodels$nIter)) {
      out[[j+1]] <- caTools::predict.LogitBoost(modelFit,
                                                newdata,
                                                nIter = submodels$nIter[j])
      }
    }
  out                   
  }
    end.rcode--> 

<p>
A few more notes:
</p>
<ul>
 <li> 
 The code in the <span class="mx arg">fit</span> element does not have to change. 
 </li>
 <li>
 The <span class="mx arg">prob</span> slot works in the same way. The only difference is that the values saved in the outgoing lists are matrices or data frames of probabilities for each class. 
  </li>
  <li>
  After model training (i.e. predicting new samples), the value of <span class="mx arg">submodels</span> is set to <code><!--rinline 'NULL' --></code> and the code produces a single set of predictions.
  </li>
  <li>
  If the model had one sequential parameter and one fixed parameter, the <code>loop</code> data frame would have two columns (one for each parameter). If the model is tuned over more than one value of the fixed parameter, the <code>submodels</code> list would have more than one element. If <code>loop</code> had 10 rows, then <tt><!--rinline 'length(submodels)'  --></tt> would be <tt>10</tt> and <tt><!--rinline 'loop[i,]' --></tt> would be linked to <tt><!--rinline 'submodels[[i]]' --></tt>. 
  </li>
</ul>

<p> Here is a slimmed down version of the logitBoost code already in the package:
</p>

<!--begin.rcode custom_lb_funcs, tidy=FALSE
lbFuncs <- list(library = "caTools",
                loop = function(grid) {            
                  loop <- grid[which.max(grid$nIter),,drop = FALSE]
                  submodels <- grid[-which.max(grid$nIter),,drop = FALSE]
                  submodels <- list(submodels)  
                  list(loop = loop, submodels = submodels)
                },
                type = "Classification",
                parameters = data.frame(parameter = 'nIter',
                                        class = 'numeric',
                                        label = '# Boosting Iterations'),
                grid = function(x, y, len = NULL, search = "grid") {
                  out <- if(search == "grid") 
                    data.frame(nIter = 1 + ((1:len)*10)) else 
                    data.frame(nIter = sample(1:500, size = len))
                  out
                },
                fit = function(x, y, wts, param, lev, last, weights, classProbs, ...) {
                  LogitBoost(as.matrix(x), y, nIter = param$nIter)
                },
                predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
                  out <- predict(modelFit, newdata, type="class")
                  if(!is.null(submodels)) {                   
                    tmp <- out
                    out <- vector(mode = "list", length = nrow(submodels) + 1)
                    out[[1]] <- tmp
                    
                    for(j in seq(along = submodels$nIter)) {
                      out[[j+1]] <- predict(modelFit,
                                            newdata,
                                            nIter = submodels$nIter[j])
                    }
                  }
                  out                   
                },
                prob = NULL,
                sort = function(x) x)
    end.rcode--> 

<p>Should you care about this? Let's tune the model over the same data set used for the SVM model above and see how long it takes:
</p>

<!--begin.rcode custom_reset_workers1, echo=FALSE, results = 'hide'
registerDoSEQ()
 end.rcode-->   
<!--begin.rcode custom_lb_mod, tidy=FALSE
set.seed(825)
lb1 <- system.time(train(Class ~ ., data = training, 
                         method = lbFuncs, 
                         tuneLength = 3,
                         trControl = fitControl))
lb1

## Now get rid of the submodel parts
lbFuncs2 <- lbFuncs
lbFuncs2$predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) 
  predict(modelFit, newdata, type="class")
lbFuncs2$loop <- NULL 

set.seed(825)
lb2 <- system.time(train(Class ~ ., data = training, 
                         method = lbFuncs2, 
                         tuneLength = 3,
                         trControl = fitControl))
lb2
end.rcode--> 
<p>On a data set with <!--rinline I(nrow(training)) --> instances and <!--rinline I(ncol(training) - 1) --> predictors and a model that is tuned over only 3 parameter values, there is a <!--rinline I(round(lb2[3]/lb1[3],2))-->-fold speed-up. If the model were more computationally taxing or the data set were larger or the number of tune parameters that were evaluated was larger, the speed-up would increase. Here is a plot of the speed-up for a few more values of <span class="mx arg">tuneLength</span>:
</p>

<!--begin.rcode custom_lb_times, tidy=FALSE,fig.width=7,fig.height=5
bigGrid <- data.frame(nIter = seq(1, 151, by = 10))
results <- bigGrid
results$SpeedUp <- NA

for(i in 2:nrow(bigGrid)){ 
  rm(lb1, lb2)
  set.seed(825)
  lb1 <- system.time(train(Class ~ ., data = training, 
                           method = lbFuncs, 
                           tuneGrid = bigGrid[1:i,,drop = FALSE],
                           trControl = fitControl))
  
  set.seed(825)
  lb2 <- system.time(train(Class ~ ., data = training, 
                           method = lbFuncs2, 
                           tuneGrid = bigGrid[1:i,,drop = FALSE],
                           trControl = fitControl))
  results$SpeedUp[i] <- lb2[3]/lb1[3]
  }

trellis.par.set(caretTheme())
xyplot(SpeedUp ~ nIter, 
       data = results,
       xlab = "LogitBoost Iterations", 
       ylab = "Speed-Up",
       type = c("p", "g", "r"))
end.rcode--> 
<!--begin.rcode custom_reset_workers2, echo=FALSE, results = 'hide'
registerDoMC(cores=detectCores()-1)
 end.rcode-->  
<p>
The speed-ups show a significant decrease in training time using this method.
</p>

<div id="Illustration3"></div>
<h1>Illustrative Example 3: Nonstandard Formulas</h1>

<p>
(Note: the previous third illustration ("SMOTE During Resampling") is no longer needed due to the inclusion of subsampling via <span class="mx funCall">train</span>.)
</p>
<p>
One limitation of <span class="mx funCall">train</span> is that it requires the use of basic model formulas. There are several functions that use special formulas or operators on predictors that won't  (and perhaps should not) work in the top level call to <span class="mx funCall">train</span>. However, we can still fit these models. 
</p>
<p>
Here is an example using the <span class="mx funCall">mboost</span> function in the <strong>mboost</strong> package from the help page. 
</p>
<!--begin.rcode custom_mboost_orig, tidy=FALSE
library(mboost)
data("bodyfat", package = "TH.data")
mod <- mboost(DEXfat ~ btree(age) + bols(waistcirc) + bbs(hipcirc),
              data = bodyfat)
mod
end.rcode--> 

<p> 
We can create a custom model that mimics this code so that we can obtain resampling estimates for this specific model:
</p>

<!--begin.rcode custom_mboost_code, tidy=FALSE
modelInfo <- list(label = "Model-based Gradient Boosting",
                  library = "mboost",
                  type = "Regression",
                  parameters = data.frame(parameter = "parameter",
                                          class = "character",
                                          label = "parameter"),
                  grid = function(x, y, len = NULL, search = "grid") 
                    data.frame(parameter = "none"),
                  loop = NULL,
                  fit = function(x, y, wts, param, lev, last, classProbs, ...) {          
                    ## mboost requires a data frame with predictors and response
                    dat <- if(is.data.frame(x)) x else as.data.frame(x)
                    dat$DEXfat <- y
                    mod <- mboost(DEXfat ~ btree(age) + bols(waistcirc) + bbs(hipcirc),
                                  data = dat)
                    },
                  predict = function(modelFit, newdata, submodels = NULL) {
                    if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
                    ## By default a matrix is returned; we convert it to a vector
                    predict(modelFit, newdata)[,1]
                  },
                  prob = NULL,
                  predictors = function(x, ...) {
                    unique(as.vector(variable.names(x)))
                  },
                  tags = c("Ensemble Model", "Boosting", "Implicit Feature Selection"),
                  levels = NULL,
                  sort = function(x) x)

end.rcode--> 

<!--begin.rcode custom_mboost_cv, tidy=FALSE
## Just use the basic formula method so that these predictors
## are passed 'as-is' into the model fitting and prediction
## functions.
set.seed(307)
mboost_resamp <- train(DEXfat ~ age + waistcirc + hipcirc, 
                       data = bodyfat, 
                       method = modelInfo,
                       trControl = trainControl(method = "repeatedcv",
                                                repeats = 5))
mboost_resamp
end.rcode--> 


<div id="Illustration4"></div>
<h1>Illustrative Example 4: PLS Feature Extraction Pre-Processing</h1>

<p>PCA is a common tool for feature extraction prior to modeling but is <i>unsupervised</i>. Partial Least Squares (PLS) is essentially a supervised version of PCA. For some data sets, there may be some benefit to using PLS to generate new features from the original data (the PLS scores) then use those as an input into a different predictive model. PLS requires parameter tuning. In the example below, we use PLS on a data set with highly correlated predictors then use the PLS scores in a random forest model.
</p>
<p>The "trick" here is to save the PLS loadings along with the random forest model fit so that the loadings can be used on future samples for prediction. Also, the PLS and random forest models are <i>jointly</i> tuned instead of an initial modeling process that finalizes the PLS model, then builds the random forest model separately. In this was we optimize both at once. Another important point is that the resampling results reflect the variability in the random forest <i>and</i> PLS models. If we did PLS up-front then resampled the random forest model, we would under-estimate the noise in the modeling process. 
</p>
<p>The tecator spectroscopy data are used:
</p>
<!--begin.rcode custom_meats, tidy=FALSE
data(tecator)

set.seed(930)

## We will model the protein content data
trainMeats <- createDataPartition(endpoints[,3], p = 3/4)
absorpTrain  <- absorp[trainMeats[[1]], ]
proteinTrain <- endpoints[trainMeats[[1]], 3]
absorpTest   <- absorp[-trainMeats[[1]], ]
proteinTest  <- endpoints[-trainMeats[[1]], 3]
    end.rcode--> 

<p>Here is the model code:</p>
<!--begin.rcode custom_pls_rf, tidy=FALSE
pls_rf <- list(label = "PLS-RF",
               library = c("pls", "randomForest"),
               type = "Regression",
               ## Tune over both parameters at the same time
               parameters = data.frame(parameter = c('ncomp', 'mtry'),
                                       class = c("numeric", 'numeric'),
                                       label = c('#Components', 
                                                 '#Randomly Selected Predictors')),
               grid = function(x, y, len = NULL, search = "grid") {
                 if(search == "grid") {
                   grid <- expand.grid(ncomp = seq(1, min(ncol(x) - 1, len), by = 1),
                                       mtry = 1:len)
                   } else {
                     grid <- expand.grid(ncomp = sample(1:ncol(x), size = len),
                                         mtry = sample(1:ncol(x), size = len))
                     }
                 ## We can't have mtry > ncomp
                 grid <- subset(grid, mtry <= ncomp)
                 },
               loop = NULL,
               fit = function(x, y, wts, param, lev, last, classProbs, ...) { 
                 ## First fit the pls model, generate the training set scores,
                 ## then attach what is needed to the random forest object to 
                 ## be used later
                 
                 ## plsr only has a formula interface so create one data frame
                 dat <- if(!is.data.frame(x)) x <- as.data.frame(x)
                 dat$y <- y
                 pre <- plsr(y~ ., data = dat, ncomp = param$ncomp)
                 scores <- predict(pre, x, type = "scores")
                 colnames(scores) <- paste("score", 1:param$ncomp, sep = "")
                 mod <- randomForest(scores, y, mtry = param$mtry, ...)
                 mod$projection <- pre$projection
                 mod
                 },
               predict = function(modelFit, newdata, submodels = NULL) {  
                 ## Now apply the same scaling to the new samples
                 scores <- as.matrix(newdata)  %*% modelFit$projection
                 colnames(scores) <- paste("score", 1:ncol(scores), sep = "")
                 scores <- as.data.frame(scores)
                 ## Predict the random forest model
                 predict(modelFit, scores)
                 },
               prob = NULL,
               varImp = NULL,
               predictors = function(x, ...) rownames(x$projection),
               levels = function(x) x$obsLevels,
               sort = function(x) x[order(x[,1]),])
    end.rcode--> 
<p>We fit the models and look at the resampling results for the joint model:</p>
<!--begin.rcode custom_meat_mod1, tidy=FALSE,fig.width=7,fig.height=5
meatCtrl <- trainControl(method = "repeatedcv", repeats = 5)

## These will take a while for these data
set.seed(184)
plsrf <- train(absorpTrain, proteinTrain, 
               method = pls_rf,
               preProc = c("center", "scale"),
               tuneLength = 10,
               ntree = 1000,
               trControl = meatCtrl)
ggplot(plsrf, plotType = "level")
    end.rcode--> 
<!--begin.rcode custom_meat_mod2, tidy=FALSE
## How does random forest do on its own?
set.seed(184)
rfOnly <- train(absorpTrain, proteinTrain, 
                method = "rf",
                tuneLength = 10,
                ntree = 1000,
                trControl = meatCtrl)
getTrainPerf(rfOnly)

## How does random forest do on its own?
set.seed(184)
plsOnly <- train(absorpTrain, proteinTrain, 
                 method = "pls",
                 tuneLength = 20,
                 preProc = c("center", "scale"),
                 trControl = meatCtrl)
getTrainPerf(plsOnly)
    end.rcode--> 
<p>The test set results indicate that these data like the linear model more than anything:</p>
<!--begin.rcode custom_meat_test, tidy=FALSE
postResample(predict(plsrf, absorpTest), proteinTest)
postResample(predict(rfOnly, absorpTest), proteinTest)
postResample(predict(plsOnly, absorpTest), proteinTest)
    end.rcode--> 

<div id="Illustration5"></div>
<h1>Illustrative Example 5: Optimizing probability thresholds for class imbalances</h1>

<p>This description was originally posted on <a href="http://appliedpredictivemodeling.com/blog/">this blog.</a></p>

<p>
One of the toughest problems in predictive model occurs when the classes have a severe imbalance. In <a href="http://appliedpredictivemodeling.com/">our book</a>, we spend <a href="http://rd.springer.com/chapter/10.1007/978-1-4614-6849-3_16">an entire chapter</a> on this subject itself. One consequence of this is that the performance is generally very biased against the class with the smallest frequencies. For example, if the data have a majority of samples belonging to the first class and very few in the second class, most predictive models will maximize accuracy by predicting everything to be the first class. As a result there's usually great sensitivity but poor specificity.

As a demonstration will use a simulation system <a href="http://appliedpredictivemodeling.com/blog/2013/4/11/a-classification-simulation-system">described here</a>. By default it has about a 50-50 class frequency but we can change this by altering the function argument called <tt>intercept</tt>:

<!--begin.rcode custom_thresh_data, tidy=FALSE
library(caret)

set.seed(442)
trainingSet <- twoClassSim(n = 500, intercept = -16)
testingSet  <- twoClassSim(n = 500, intercept = -16)

## Class frequencies
table(trainingSet$Class)
    end.rcode--> 
    
<p>
There is almost a 9:1 imbalance in these data.

Let's use a standard random forest model with these data using the default value of <tt>mtry</tt>. We'll also use repeated 10-fold cross validation to get a sense of performance:
</p>

<!--begin.rcode custom_thresh_mod0, tidy=FALSE,fig.width=6,fig.height=6
set.seed(949)
mod0 <- train(Class ~ ., data = trainingSet,
              method = "rf",
              metric = "ROC",
              tuneGrid = data.frame(mtry = 3),
              ntree = 1000,
              trControl = trainControl(method = "repeatedcv",
                                       repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = twoClassSummary))
getTrainPerf(mod0)

## Get the ROC curve
roc0 <- roc(testingSet$Class, 
            predict(mod0, testingSet, type = "prob")[,1], 
            levels = rev(levels(testingSet$Class)))
roc0

## Now plot
plot(roc0, print.thres = c(.5), type = "S",
     print.thres.pattern = "%.3f (Spec = %.2f, Sens = %.2f)",
     print.thres.cex = .8, 
     legacy.axes = TRUE)
    end.rcode--> 
    
<p>
The area under the ROC curve is very high, indicating that the model has very good predictive power for these data. The plot shows the default probability cut off value of 50%. The sensitivity and specificity values associated with this point indicate that performance is not that good when an actual call needs to be made on a sample. 
</p>
<p>
One of the most common ways to deal with this is to determine an alternate probability cut off using the ROC curve. But to do this well, another set of data (not the test set) is needed to set the cut off and the test set is used to validate it. We don't have a lot of data this is difficult since we will be spending some of our data just to get a single cut off value.
</p>
<p>
Alternatively the model can be tuned, using resampling, to determine any model tuning parameters as well as an appropriate cut off for the probabilities. 
</p>
<p>
Suppose the model has one tuning parameter and we want to look at four candidate values for tuning. Suppose we also want to tune the probability cut off over 20 different thresholds. Now we have to look at 20×4=80 different models (and that is for each resample). One other feature that has been opened up his ability to use sequential parameters: these are tuning parameters that don't require a completely new model fit to produce predictions. In this case, we can fit one random forest model and get it's predicted class probabilities and evaluate the candidate probability cutoffs using these same hold-out samples. Here is what the model code looks like:
</p>
<!--begin.rcode custom_thresh_code, tidy=FALSE,
## Get the model code for the original random forest method:

thresh_code <- getModelInfo("rf", regex = FALSE)[[1]]
thresh_code$type <- c("Classification")
## Add the threshold as another tuning parameter
thresh_code$parameters <- data.frame(parameter = c("mtry", "threshold"),
                                     class = c("numeric", "numeric"),
                                     label = c("#Randomly Selected Predictors",
                                               "Probability Cutoff"))
## The default tuning grid code:
thresh_code$grid <- function(x, y, len = NULL, search = "grid") {
  p <- ncol(x)
  if(search == "grid") {
    grid <- expand.grid(mtry = floor(sqrt(p)), 
                        threshold = seq(.01, .99, length = len))
    } else {
      grid <- expand.grid(mtry = sample(1:p, size = len),
                          threshold = runif(1, 0, size = len))
      }
  grid
  }

## Here we fit a single random forest model (with a fixed mtry)
## and loop over the threshold values to get predictions from the same
## randomForest model.
thresh_code$loop = function(grid) {   
  library(plyr)
  loop <- ddply(grid, c("mtry"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for(i in seq(along = loop$threshold)) {
    index <- which(grid$mtry == loop$mtry[i])
    cuts <- grid[index, "threshold"] 
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
    }    
  list(loop = loop, submodels = submodels)
  }

## Fit the model independent of the threshold parameter
thresh_code$fit = function(x, y, wts, param, lev, last, classProbs, ...) { 
  if(length(levels(y)) != 2)
    stop("This works only for 2-class problems")
  randomForest(x, y, mtry = param$mtry, ...)
  }

## Now get a probability prediction and use different thresholds to
## get the predicted class
thresh_code$predict = function(modelFit, newdata, submodels = NULL) {
  class1Prob <- predict(modelFit, 
                        newdata, 
                        type = "prob")[, modelFit$obsLevels[1]]
  ## Raise the threshold for class #1 and a higher level of
  ## evidence is needed to call it class 1 so it should 
  ## decrease sensitivity and increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1], 
                modelFit$obsLevels[2])
  if(!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold))
    out[[1]] <- tmp2
    for(i in seq(along = submodels$threshold)) {
      out[[i+1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                           modelFit$obsLevels[1], 
                           modelFit$obsLevels[2])
      }
    } 
  out  
  }

## The probabilities are always the same but we have to create
## mulitple versions of the probs to evaluate the data across
## thresholds
thresh_code$prob = function(modelFit, newdata, submodels = NULL) {
  out <- as.data.frame(predict(modelFit, newdata, type = "prob"))
  if(!is.null(submodels)) {
    probs <- out
    out <- vector(mode = "list", length = length(submodels$threshold)+1)
    out <- lapply(out, function(x) probs)
    } 
  out 
  }
    end.rcode--> 
    
<p>
Basically, we define a list of model components (such as the fitting code, the prediction code, etc.) and feed this into the train function instead of using a pre-listed model string (such as <tt><!--rinline 'method = "rf"'  --></tt>). For this model and these data, there was an 8% increase in training time to evaluate 20 additional values of the probability cut off.
</p>
<p>
How do we optimize this model? Normally we might look at the area under the ROC curve as a metric to choose our final values. In this case the ROC curve is independent of the probability threshold so we have to use something else. A common technique to evaluate a candidate threshold is see how close it is to the perfect model where sensitivity and specificity are one. Our code will use the distance between the current model's performance and the best possible performance and then have train minimize this distance when choosing it's parameters. Here is the code that we use to calculate this:
</p>
<!--begin.rcode custom_thresh_dist, tidy=FALSE,results='hide'
fourStats <- function (data, lev = levels(data$obs), model = NULL) {
  ## This code will get use the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  
  ## The best possible model has sensitivity of 1 and specificity of 1. 
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), 
                   ncol = 2, 
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}

set.seed(949)
mod1 <- train(Class ~ ., data = trainingSet,
              method = thresh_code,
              ## Minimize the distance to the perfect model
              metric = "Dist",
              maximize = FALSE,
              tuneLength = 20,
              ntree = 1000,
              trControl = trainControl(method = "repeatedcv",
                                       repeats = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))

mod1
    end.rcode--> 
<!--begin.rcode custom_thresh_dist_print, tidy=FALSE,echo=FALSE
print(mod1, digits = 3)
    end.rcode--> 

<p>
results
</p>
<p>
Using <tt><!--rinline 'ggplot(mod1)'  --></tt> will show the performance profile. Instead here is a plot of the sensitivity, specificity, and distance to the perfect model:
</p>
<!--begin.rcode custom_thresh_mod1, tidy=FALSE,fig.width=7,fig.height=5
library(reshape2)
metrics <- mod1$results[, c(2, 4:6)]
metrics <- melt(metrics, id.vars = "threshold", 
                variable.name = "Resampled",
                value.name = "Data")

ggplot(metrics, aes(x = threshold, y = Data, color = Resampled)) + 
  geom_line() + 
  ylab("") + xlab("Probability Cutoff") +
  theme(legend.position = "top")
    end.rcode--> 
<p>
You can see that as we increase the probability cut off for the first class it takes more and more evidence for a sample to be predicted as the first class. As a result the sensitivity goes down when the threshold becomes very large. The upside is that we can increase specificity in the same way. The blue curve shows the distance to the perfect model. The value of <!--rinline I(round(mod1$bestTune$threshold, 2)) --> was found to be optimal. 
</p>
<p>
Now we can use the test set ROC curve to validate the cut off we chose by resampling. Here the cut off closest to the perfect model is <!--rinline I(round(coords(roc0, x = "best", ret="threshold", best.method="closest.topleft"), 2)) -->. We were able to find a good probability cut off value without setting aside another set of data for tuning the cut off.
</p>
<p>
One great thing about this code is that it will automatically apply the optimized probability threshold when predicting new samples. 
</p>

 
<div id="Illustration6"></div>
<h1>Illustrative Example 6: Offsets in Generalized Linear Models</h1>

<p>
Like the <span class="mx funCall">mboost</span> example <a href="#Illustration3">above</a>, a custom method is required since a formula element is used to set the offset variable. Here is an example from <tt>?</tt><span class="mx funCall">glm</span>:
</p>
<!--begin.rcode custom_glm_offset, tidy=FALSE,echo=FALSE
utils::data(anorexia, package = "MASS")

anorex.1 <- glm(Postwt ~ Prewt + Treat + offset(Prewt), data = anorexia)
coef(anorex.1)
    end.rcode--> 
<p>
We can write a small custom method to duplicate this model. Two details of note:
</p>
<ul>
  <li>
  If we have factors in the data and do not want <span class="mx funCall">train</span> to convert them to dummy variables, the formula method for <span class="mx funCall">train</span> should be avoided. We can let <span class="mx funCall">glm</span> do that inside the custom method. This would help <span class="mx funCall">glm</span> understand that the dummy variable columns came from the same original factor. This will avoid errors in other functions used with <span class="mx funCall">glm</span> (e.g. <span class="mx funCall">anova</span>).
  </li>
  <li>
  The slot for <span class="mx arg">x</span> should include any variables that are on the right-hand side of the model formula, including the offset column. 
  </li>
</ul>
<p>
Here is the custom model:
</p>
<!--begin.rcode custom_train_offset, tidy=FALSE
offset_mod <- getModelInfo("glm", regex = FALSE)[[1]]
offset_mod$fit <- function(x, y, wts, param, lev, last, classProbs, ...) {
  dat <- if(is.data.frame(x)) x else as.data.frame(x)
  dat$Postwt <- y
  glm(Postwt ~ Prewt + Treat + offset(Prewt), family = gaussian, data = dat)
}

mod <- train(x = anorexia[, 1:2], y = anorexia$Postwt, method = offset_mod)
coef(mod$finalModel)
    end.rcode--> 
   
<div style="clear: both;">&nbsp;</div>
  </div>
  <!-- end #content -->
<div id="sidebar">
<ul>
  <li>
    <h2>General Topics</h2>
    <ul>
      <li><a href="index.html">Front Page</a></li>
      <li><a href="visualizations.html">Visualizations</a></li>
      <li><a href="preprocess.html">Pre-Processing</a><li>
      <li><a href="splitting.html">Data Splitting</a></li>
      <li><a href="varimp.html">Variable Importance</a></li>
      <li><a href="other.html">Model Performance</a></li>
      <li><a href="parallel.html">Parallel Processing</a></li>
    </ul>
    <h2>Model Training and Tuning</h2>
    <ul>
      <li><a href="training.html">Basic Syntax</a></li>
      <li><a href="modelList.html">Sortable Model List</a></li>
      <li><a href="bytag.html">Models By Tag</a></li>
      <li><a href="similarity.html">Models By Similarity</a></li>
      <li><a href="custom_models.html">Using Custom Models</a></li>
      <li><a href="sampling.html">Sampling for Class Imbalances</a></li> 
      <li><a href="random.html">Random Search</a></li> 
      <li><a href="adaptive.html">Adaptive Resampling</a></li> 
    </ul>
    <h2>Feature Selection</h2>
    <ul>
      <li><a href="featureselection.html">Overview</a>
      <li><a href="rfe.html">RFE</a></li>
      <li><a href="filters.html">Filters</a></li>
      <li><a href="GA.html">GA</a></li>
      <li><a href="SA.html">SA</a></li>
    </ul>  
  </li>
</ul>
</div>
<!-- end #sidebar -->
<div style="clear: both;">&nbsp;</div>
  </div>
  <div class="container"><img src="images/img03.png" width="1000" height="40" alt="" /></div>
  <!-- end #page -->
</div>
  <div id="footer-content"></div>
<!--begin.rcode custom_session, echo = FALSE
knit_hooks$set(inline = hook_inline)    
    end.rcode-->   
  <div id="footer">
  <p>Created on <!--rinline I(session) -->.</p>
  </div>
  <!-- end #footer -->
</body>
  </html>
